4.2. Segmentation with Python (I)

Code adapted from: https://www.kaggle.com/arnavkj95/candidate-generation-and-luna16-preprocessing

Now we will do the segmentation but instead of using R, we will be using Python.

First we need to import some Python libraries.

import numpy as np # pip3 install numpy
import pandas as pd # pip3 install pandas
# pip3 install matplotlib
# pip3 install scipy
import skimage # pip3 install scikit-image
import os 
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.filters import roberts, sobel
from skimage import measure, feature
from skimage.segmentation import clear_border
from skimage import data
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import dicom # pip3 install dicom
import scipy.misc
import pydicom # pip3 install pydicom
import matplotlib.pyplot as plt

Each scan consist in multiple slices. We have all the DICOM images from the scan in one folder. In path_images we indicate the path of the folder.

from subprocess import check_output
path_images = "/Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/"
# You can check that everything is loading correctly with: print(check_output(["ls", path_images]).decode("utf8"))

To read the images, we will use the function pydicom.read_file(). Then we will update the intensity values of -2000 with 0. These pixels are the ones that are located outside the scanner bounds.

# pip3 install nltk==3.6.2
lung = pydicom.read_file("/Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/20.dcm")
slice = lung.pixel_array
plt.axis('off')
slice[slice == -2000] = 0
plt.imshow(slice, cmap=plt.cm.gray)
plt.show()

We create a function file_is_hidden() to read only .dcm files and not hidden files in the folder that we cannot see in our computers.

if os.name == 'nt':
    import win32api, win32con
def file_is_hidden(p):
    if os.name== 'nt':
        attribute = win32api.GetFileAttributes(p)
        return attribute & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM)
    else:
        return p.startswith('.') #linux-osx
file_list = [f for f in os.listdir(path_images) if not file_is_hidden(f)] 

Now we will read all the images from a folder with a function named read_ct_scan(folder_name).

def read_ct_scan(folder_name):
  # Read the slices from the dicom file
  slices = [pydicom.read_file(folder_name + filename) for filename in os.listdir(folder_name) if not file_is_hidden(filename)]
  # Sort the dicom slices in their respective order
  slices.sort(key=lambda x: int(x.InstanceNumber))
  # Get the pixel values for all the slices
  slices = np.stack([s.pixel_array for s in slices])
  slices[slices == -2000] = 0
  return slices
  
ct_scan = read_ct_scan(path_images) 

Plot some of the images from a folder.

def plot_ct_scan(scan):
    f, plots = plt.subplots(int(scan.shape[0] / 20) + 1, 4, figsize=(25, 25))
    for i in range(0, scan.shape[0], 5):
        plots[int(i / 20), int((i % 20) / 5)].axis('off')
        plots[int(i / 20), int((i % 20) / 5)].imshow(scan[i], cmap=plt.cm.gray) 

plot_ct_scan(ct_scan)
plt.show()

Now we will do the lung segmentation of 1 image.

def get_segmented_lungs2(im, plot=False):
    if plot == True:
        f, plots = plt.subplots()
    # Step 1: Convert into a binary image. 
    binary = im < 604
    # Step 2: Remove the blobs connected to the border of the image.
    cleared = clear_border(binary)
    # Step 3: Label the image.
    label_image = label(cleared)
    # Step 4: Keep the labels with 2 largest areas.
    areas = [r.area for r in regionprops(label_image)]
    areas.sort()
    if len(areas) > 2:
        for region in regionprops(label_image):
            if region.area < areas[-2]:
                for coordinates in region.coords:                
                       label_image[coordinates[0], coordinates[1]] = 0
    binary = label_image > 0
    # Step 5: Erosion operation with a disk of radius 2. This operation is seperate the lung nodules attached to the blood vessels.
    selem = disk(2)
    binary = binary_erosion(binary, selem)
    # Step 6: Closure operation with a disk of radius 10. This operation is to keep nodules attached to the lung wall.
    selem = disk(10)
    binary = binary_closing(binary, selem)
    # Step 7: Fill in the small holes inside the binary mask of lungs.
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)
    if plot == True:
        plots.axis('off')
        plots.imshow(binary, cmap=plt.cm.bone) 
    # Step 8: Superimpose the binary mask on the input image.
    get_high_vals = binary == 0
    im[get_high_vals] = 0
    return im

Image

Mask of the image

get_segmented_lungs2(ct_scan[20], True)
plt.show()

The steps to obtain the segmentation are the following.

Now we will briefly explain the process of radiomics.

  1. First we convert de image into a binary image. This means that every pixel of the image has only one of two possible values. One black and one white.
  2. We remove the spots connected to the edge of the image.
  3. Create a label.
  4. Keep the labels with two largest areas.
  5. Seperate the lung nodules attached to the blood vessels.
  6. Keep nodules attached to the lung wall.
  7. Fill in the small holes inside the binary mask of lungs.
  8. Superimpose the binary mask on the input image.

We can do the segmentation of all the slices of the scan.

def segment_lung_from_ct_scan(ct_scan):
    return np.asarray([get_segmented_lungs2(slice) for slice in ct_scan])
segmented_ct_scan2 = segment_lung_from_ct_scan(ct_scan)
plot_ct_scan(segmented_ct_scan2)
plt.show()


4.3. Segmentation with Python (II)

Code adapted from: https://www.raddq.com/dicom-processing-segmentation-visualization-in-python/

We will study another way to perform segmentation of lung scans using Python.

As before, first we need to import some Python libraries.

import numpy as np
import dicom
import os
import matplotlib.pyplot as plt
from glob import glob
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
import scipy.ndimage
from skimage import morphology
from skimage import measure
from skimage.transform import resize
# pip3 install scikit-learn
from sklearn.cluster import KMeans
# pip3 install plotly
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.tools import FigureFactory as FF
from plotly.graph_objs import *
import pydicom

Specify the path where the folder with all the images is, and the path where we want our mask images saved.

data_path = "/Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/"
output_path = working_path = "/Users/andrealetaalfonso/Desktop/TFG/"

Read all the DICOM files. To do so, we use glob, which is used to return all file paths that match a specific pattern.

g = glob(data_path + '/*.dcm')

Print out the first 5 file names to verify we are in the right folder.

print("Total of %d DICOM images.\nFirst 5 filenames:" % len(g))
## Total of 62 DICOM images.
## First 5 filenames:
print('\n'.join(g[:5]))
## /Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/16.dcm
## /Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/17.dcm
## /Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/15.dcm
## /Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/29.dcm
## /Users/andrealetaalfonso/Desktop/TFG/images/Kaggle/ID00184637202242062969203/28.dcm

Now we loop over the image files and store everything into a list.

def load_scan(path):
    slices = [pydicom.read_file(path + '/' + s) for s in os.listdir(path) if not file_is_hidden(s)]
    slices.sort(key = lambda x: int(x.InstanceNumber))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    for s in slices:
        s.SliceThickness = slice_thickness
    return slices

The voxel values in the images are raw. This function converts raw values into Houndsfeld units (a quantitative scale for describing radiodensity).

def get_pixels_hu(scans):
    image = np.stack([s.pixel_array for s in scans])
    # Convert to int16 (from sometimes int16), should be possible as values should always be low enough (<32k)
    image = image.astype(np.int16)
    # Set outside-of-scan pixels to 1. The intercept is usually -1024, so air is approximately 0
    image[image == -2000] = 0
    # Convert to Hounsfield units (HU)
    intercept = scans[0].RescaleIntercept
    slope = scans[0].RescaleSlope
    if slope != 1:
        image = slope * image.astype(np.float64)
        image = image.astype(np.int16)
    image += np.int16(intercept)
    return np.array(image, dtype=np.int16)
id = 0
patient = load_scan(data_path) 
imgs = get_pixels_hu(patient) 

Save in .npy format

np.save(output_path + "fullimages_%d.npy" % (id), imgs)

Now we will check if the Houndsfeld Units (HU) are properly scaled and represented. HU are very important because they are standardized across all CT scans. To give a few examples, air is −1000, lung −500, fat −100 to −50, water 0, blood +30 to +70, muscle +10 to +40 and so on.

Let’s display an histogram to check HU.

file_used=output_path+"fullimages_%d.npy" % id
imgs_to_process = np.load(file_used).astype(np.float64) 
plt.hist(imgs_to_process.flatten(), bins=50, color='c')
plt.xlabel("Hounsfield Units (HU)")
plt.ylabel("Frequency")
plt.show()

With the histogram, we can see that there is a lot of air and water. An abundance of lung, fat, bood and muscle.

We obtain the conclusions that we will need to do preprocessing to study well the lungs only, because there is a lot of air and water.

Now we display some of the images, we will be skipping every two slices.

id = 0
imgs_to_process = np.load(output_path+'fullimages_{}.npy'.format(id))

def sample_stack(stack, rows=4, cols=4, start_with=5, show_every=2):
    fig,ax = plt.subplots(rows,cols,figsize=[12,12])
    for i in range(rows*cols):
        ind = start_with + i*show_every
        ax[int(i/rows),int(i % rows)].set_title('slice %d' % ind)
        ax[int(i/rows),int(i % rows)].imshow(stack[ind],cmap='gray')
        ax[int(i/rows),int(i % rows)].axis('off')
    plt.show()

sample_stack(imgs_to_process)

We can see a lot of gray that represents air.

Let’s see how thick each slice is.

print("Slice Thickness: %f" % patient[0].SliceThickness)
## Slice Thickness: 5.000000
print("Pixel Spacing (row, col): (%f, %f) " % (patient[0].PixelSpacing[0], patient[0].PixelSpacing[1]))
## Pixel Spacing (row, col): (0.722656, 0.722656)

Now we do resampling. We want that each slice is resampled in 1x1x1 mm pixels and slices, because it will be useful to obtain the 3d image.

id = 0
imgs_to_process = np.load(output_path+'fullimages_{}.npy'.format(id))
def resample(image, scan, new_spacing=[1,1,1]):
    # Determine current pixel spacing
    spacing = map(float, ([scan[0].SliceThickness] + list(scan[0].PixelSpacing)))
    spacing = np.array(list(spacing))
    resize_factor = spacing / new_spacing
    new_real_shape = image.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / image.shape
    new_spacing = spacing / real_resize_factor
    image = scipy.ndimage.interpolation.zoom(image, real_resize_factor)
    return image, new_spacing
print ("Shape before resampling\t", imgs_to_process.shape)
## Shape before resampling   (62, 512, 512)
imgs_after_resamp, spacing = resample(imgs_to_process, patient, [1,1,1])
print ("Shape after resampling\t", imgs_after_resamp.shape)
## Shape after resampling    (310, 370, 370)

3D Plotting

def make_mesh(image, threshold=-300, step_size=1):
    p = image.transpose(2,1,0)
    verts, faces, norm, val = measure.marching_cubes(p, threshold, step_size=step_size, allow_degenerate=True) 
    return verts, faces
def plt_3d(verts, faces):
    x,y,z = zip(*verts) 
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    # Fancy indexing: `verts[faces]` to generate a collection of triangles
    mesh = Poly3DCollection(verts[faces], linewidths=0.05, alpha=1)
    face_color = [1, 1, 0.9]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)
    ax.set_xlim(0, max(x))
    ax.set_ylim(0, max(y))
    ax.set_zlim(0, max(z))
    ax.set_facecolor((0.7, 0.7, 0.7)) # canvio depricated version!
    plt.show()
v, f = make_mesh(imgs_after_resamp, 350)
plt_3d(v, f)

Segmentation

Now we will do the segmentation of the lungs.

# Standardize the pixel values
def make_lungmask(img, display=False):
    row_size= img.shape[0]
    col_size = img.shape[1]
    mean = np.mean(img)
    std = np.std(img)
    img = img-mean
    img = img/std
    # Find the average pixel value near the lungs to renormalize washed out images
    middle = img[int(col_size/5):int(col_size/5*4),int(row_size/5):int(row_size/5*4)] 
    mean = np.mean(middle)  
    max = np.max(img)
    min = np.min(img)
    # To improve threshold finding, I'm moving the underflow and overflow on the pixel spectrum
    img[img==max]=mean
    img[img==min]=mean
    # Using Kmeans to separate foreground (soft tissue / bone) and background (lung/air)
    kmeans = KMeans(n_clusters=2).fit(np.reshape(middle,[np.prod(middle.shape),1]))
    centers = sorted(kmeans.cluster_centers_.flatten())
    threshold = np.mean(centers)
    thresh_img = np.where(img<threshold,1.0,0.0)  # threshold the image
    # First erode away the finer elements, then dilate to include some of the pixels surrounding the lung.  
    # We don't want to accidentally clip the lung.
    eroded = morphology.erosion(thresh_img,np.ones([3,3]))
    dilation = morphology.dilation(eroded,np.ones([8,8]))
    labels = measure.label(dilation) # Different labels are displayed in different colors
    label_vals = np.unique(labels)
    regions = measure.regionprops(labels)
    good_labels = []
    for prop in regions:
        B = prop.bbox
        if B[2]-B[0]<row_size/10*9 and B[3]-B[1]<col_size/10*9 and B[0]>row_size/5 and B[2]<col_size/5*4:
            good_labels.append(prop.label)
    mask = np.ndarray([row_size,col_size],dtype=np.int8)
    mask[:] = 0
    # After just the lungs are left, we do another large dilation in order to fill in and out the lung mask 
    for N in good_labels:
        mask = mask + np.where(labels==N,1,0)
    mask = morphology.dilation(mask,np.ones([10,10])) # one last dilation
    if (display):
        fig, ax = plt.subplots(3, 2, figsize=[12, 12])
        ax[0, 0].set_title("Original")
        ax[0, 0].imshow(img, cmap='gray')
        ax[0, 0].axis('off')
        ax[0, 1].set_title("Threshold")
        ax[0, 1].imshow(thresh_img, cmap='gray')
        ax[0, 1].axis('off')
        ax[1, 0].set_title("After Erosion and Dilation")
        ax[1, 0].imshow(dilation, cmap='gray')
        ax[1, 0].axis('off')
        ax[1, 1].set_title("Color Labels")
        ax[1, 1].imshow(labels)
        ax[1, 1].axis('off')
        ax[2, 0].set_title("Final Mask")
        ax[2, 0].imshow(mask, cmap='gray')
        ax[2, 0].axis('off')
        ax[2, 1].set_title("Apply Mask on Original")
        ax[2, 1].imshow(mask*img, cmap='gray')
        ax[2, 1].axis('off')
        plt.show()
    return mask*img

We know obtain the segmentation to only one image.

img = imgs_after_resamp[63]
make_lungmask(img, display=True)

Anf finally we apply it to all the slices.

masked_lung = []
for img in imgs_after_resamp:
    masked_lung.append(make_lungmask(img))
sample_stack(masked_lung, show_every=10)

Save masks in .npy format.

np.save(output_path + "maskedimages_%d.npy" % (id), imgs)